package nl.peterbloem.powerlaws;

import java.util.Collection;
import java.util.List;

import nl.peterbloem.util.Generator;

/**
 * A power law is a distribution of two parameters, xMin and exponent. The power 
 * law roughly generates values above xMin with probability (x/xMin) ^ -exponent.
 * 
 * 
 * @author Peter
 *
 */
public interface PowerLaw<T extends Number> extends Generator<T>
{

	/**
	 * Returns the exponent parameter of this power law.
	 *
	 * @return the exponent
	 */
	double exponent();

	/**
	 * Returns the xMin parameter of this power law distribution.
	 *
	 * @return the xMin parameter
	 */
	T xMin();

	/**
	 * Returns the probability of the element x. When T is continuous,
	 * implementations should return a probability density instead.
	 *
	 * @param x the element to evaluate
	 * @return the probability (or probability density) at x
	 */
	double p(T x);

	/**
	 * Evaluates the cumulative distribution function, P(X <= x).
	 *
	 * @param x the point at which to evaluate the CDF
	 * @return the cumulative probability at x
	 */
	double cdf(T x);

	/**
	 * Evaluates the complement of the cumulative distribution function,
	 * i.e. the probability mass above x, P(X > x).
	 *
	 * @param x the point at which to evaluate the complementary CDF
	 * @return the complementary cumulative probability at x
	 */
	double cdfComp(T x);

	/**
	 * Generates a dataset resembling the observed data in a semi-parametric
	 * manner.
	 *
	 * Let nt be the number of points in the tail of the observed data
	 * (points x with x >= xMin). With probability nt/n a point is produced
	 * by the generate() method; with probability 1.0 - nt/n a random point
	 * x is taken from the observed data such that x <= xMin.
	 *
	 * NOTE(review): n is presumably the total number of observed points.
	 * Since the tail is defined as x >= xMin, the non-tail draw is
	 * presumably meant to be x < xMin rather than x <= xMin; confirm
	 * against the implementations.
	 *
	 * @param observed the observed data to imitate
	 * @param number presumably the number of points to generate; confirm
	 *               against the implementations
	 * @return the generated dataset
	 */
	List<T> generate(Collection<? extends T> observed, int number);

	/**
	 * Performs the Kolmogorov-Smirnov test on the given data.
	 *
	 * @param data the data to test against this model
	 * @return the result of the test (presumably the KS distance between
	 *         this model and the data; confirm against the implementations)
	 */
	double ksTest(Collection<? extends T> data);

	/**
	 * Calculates the significance of this model as a hypothesis for the
	 * given data. Based on Clauset 2007, section 4.
	 *
	 * The basic method for calculating significance should be as follows:
	 * <ol>
	 * <li>compute the KS distance d between this model and the data;</li>
	 * <li>generate n datasets from this distribution, each of the same
	 *     size as the argument;</li>
	 * <li>fit a model to each generated dataset and compute its KS
	 *     distance;</li>
	 * <li>return the proportion of generated datasets with distance lower
	 *     than d.</li>
	 * </ol>
	 *
	 * NOTE(review): Clauset et al. define the p-value as the fraction of
	 * synthetic distances that are larger than d; verify which direction
	 * the implementations actually use.
	 *
	 * @param data the data for which this model is the hypothesis
	 * @param sampleSize presumably the number n of datasets to generate;
	 *                   confirm against the implementations
	 * @return the significance of this model for the data
	 */
	double significance(Collection<? extends T> data, int sampleSize);

	/**
	 * Calculates the significance of this model as a hypothesis for the
	 * given data. Based on Clauset 2007, section 4.
	 *
	 * The basic method for calculating significance should be as follows:
	 * <ol>
	 * <li>compute the KS distance d between this model and the data;</li>
	 * <li>generate n datasets from this distribution, each of the same
	 *     size as the argument;</li>
	 * <li>fit a model to each generated dataset and compute its KS
	 *     distance;</li>
	 * <li>return the proportion of generated datasets with distance lower
	 *     than d.</li>
	 * </ol>
	 *
	 * The value of n follows from the requested accuracy epsilon through
	 * the relation n = (1/4) * epsilon^-2 (Clauset pg. 677).
	 *
	 * NOTE(review): Clauset et al. define the p-value as the fraction of
	 * synthetic distances that are larger than d; verify which direction
	 * the implementations actually use.
	 *
	 * @param data the data for which this model is the hypothesis
	 * @param epsilon the requested accuracy of the significance value
	 * @return the significance of this model for the data
	 */
	double significance(Collection<? extends T> data, double epsilon);

	/**
	 * Variant of {@link #significance(Collection, int)} taking an
	 * additional dataSamples argument.
	 *
	 * NOTE(review): the role of dataSamples is not described in this
	 * interface. It presumably limits the number of data points sampled
	 * when fitting a model to each generated dataset (compare
	 * {@link Fit#fitSampled(int)}); confirm against the implementations.
	 *
	 * @param data the data for which this model is the hypothesis
	 * @param sampleSize presumably the number of datasets to generate;
	 *                   confirm against the implementations
	 * @param dataSamples see the review note above
	 * @return the significance of this model for the data
	 */
	double significance(Collection<? extends T> data, int sampleSize, int dataSamples);

	/**
	 * Variant of {@link #significance(Collection, double)} taking an
	 * additional dataSamples argument.
	 *
	 * NOTE(review): the role of dataSamples is not described in this
	 * interface. It presumably limits the number of data points sampled
	 * when fitting a model to each generated dataset (compare
	 * {@link Fit#fitSampled(int)}); confirm against the implementations.
	 *
	 * @param data the data for which this model is the hypothesis
	 * @param epsilon the requested accuracy of the significance value
	 * @param dataSamples see the review note above
	 * @return the significance of this model for the data
	 */
	double significance(Collection<? extends T> data, double epsilon, int dataSamples);

	/**
	 * Represents the intermediate stage of fitting a power law to data.
	 *
	 * The function fit() returns the best model.
	 *
	 * @param <T> the numeric type of the data points
	 * @param <P> the type of power law model produced by the fit
	 *
	 * @author Peter
	 */
	interface Fit<T extends Number, P extends PowerLaw<T>>
	{
		/**
		 * Estimates the best model.
		 *
		 * @return the best-fitting power law
		 */
		P fit();

		/**
		 * Performs the MLE estimator for the exponent on all data, but
		 * only uses a subsample of the data to find x (presumably the
		 * xMin candidate; confirm against the implementations).
		 *
		 * The subsample is created by sorting the data and choosing
		 * elements at regular intervals in the indexing, so that its
		 * total size is approximately equal to the parameter samples.
		 *
		 * @param samples the approximate size of the subsample
		 * @return the fitted power law
		 */
		P fitSampled(int samples);

		/**
		 * Estimates a power law from the data with the appropriate xMin.
		 *
		 * @param xMin the xMin value for the model (presumably held fixed
		 *             while the remaining parameters are estimated;
		 *             confirm against the implementations)
		 * @return the fitted power law
		 */
		P fit(T xMin);
	}
}
